import dask.dataframe as dd
import pandas as pd
import numpy as np
from dask.diagnostics import ProgressBar
from datetime import timedelta
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa import stattools
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from dateutil.parser import parse
from prophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error
import plotly.offline as po
po.init_notebook_mode()
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# !wget -q --no-check-certificate '' -O "data.zip"
!unzip -q "/content/drive/MyDrive/Colab Notebooks/nyc_taxi_data.zip" -d "/content/drive/MyDrive/Colab Notebooks"
# df = dd.read_parquet('/content/nyc_taxi_alt', engine='pyarrow')
df = dd.read_parquet('/content/drive/MyDrive/Colab Notebooks/nyc_taxi_alt', engine='pyarrow')
df.columns
Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
'total_amount', 'congestion_surcharge', 'airport_fee'],
dtype='object')
df.shape[0].compute(), df.shape[1]
(179311700, 19)
df.dtypes
with ProgressBar():
missing_values = df.isnull().sum().compute()
print("missing values:\n")
missing_values
[########################################] | 100% Completed | 115.73 s
missing values:

VendorID                         0
tpep_pickup_datetime             0
tpep_dropoff_datetime            0
passenger_count            3872186
trip_distance                    0
RatecodeID                 3872186
store_and_fwd_flag         3872186
PULocationID                     0
DOLocationID                     0
payment_type                     0
fare_amount                      0
extra                            0
mta_tax                          0
tip_amount                       0
tolls_amount                     0
improvement_surcharge            0
total_amount                     0
congestion_surcharge       8660504
airport_fee              115760994
dtype: int64
columns_to_drop_null = ['passenger_count', 'RatecodeID', 'store_and_fwd_flag', 'congestion_surcharge']
def fill_nan(partition):
return partition.fillna(0)
def drop_column(partition, column):
return partition.drop(column, axis=1)
def drop_null_values(partition):
return partition.dropna(subset=columns_to_drop_null)
filtered_df = df.map_partitions(drop_null_values)
filtered_df = filtered_df.map_partitions(drop_column, column='airport_fee')
filtered_df = filtered_df.map_partitions(drop_column, column='mta_tax')
filtered_df = filtered_df.map_partitions(drop_column, column='improvement_surcharge')
The airport_fee column is mostly null, and the mta_tax and improvement_surcharge columns are mostly constant, which is why all three were dropped.
The columns passenger_count, RatecodeID, store_and_fwd_flag, and congestion_surcharge contain null values, so we dropped the rows that are null in any of them.
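Before dropping, it helps to quantify "mostly null". A minimal sketch, reusing the missing_values Series computed above and the row count from df.shape:
total_rows = 179_311_700  # row count computed earlier from df.shape
missing_fraction = (missing_values / total_rows).sort_values(ascending=False)
print(missing_fraction.head())  # airport_fee ≈ 0.65, congestion_surcharge ≈ 0.05, ...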
output_dir = '/content/drive/MyDrive/Colab Notebooks/filtered_data.parquet'
with ProgressBar():
filtered_df.to_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 211.98 s
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
def preprocess_date(partition):
return partition.assign(
pickup_before_dropoff=(
(partition['tpep_pickup_datetime'].dt.year >= 2019) & (partition['tpep_pickup_datetime'].dt.year <= 2022) &
(partition['tpep_dropoff_datetime'].dt.year >= 2019) & (partition['tpep_dropoff_datetime'].dt.year <= 2022) &
(partition['tpep_pickup_datetime'] < partition['tpep_dropoff_datetime'])
)
)
filtered_df['tpep_pickup_datetime'] = dd.to_datetime(filtered_df['tpep_pickup_datetime'])
filtered_df['tpep_dropoff_datetime'] = dd.to_datetime(filtered_df['tpep_dropoff_datetime'])
filtered_df = filtered_df.map_partitions(preprocess_date)
def drop_rows(partition):
return partition[partition['pickup_before_dropoff']]
filtered_df = filtered_df.map_partitions(drop_rows)
with ProgressBar():
filtered_df.to_parquet(output_dir, engine='pyarrow')
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 222.07 s
with ProgressBar():
negative_counts = filtered_df[
(filtered_df['extra'] < 0) |
(filtered_df['tip_amount'] < 0) | (filtered_df['tolls_amount'] < 0) |
(filtered_df['total_amount'] < 0) |
(filtered_df['congestion_surcharge'] < 0)
].count().compute()
negative_counts
def drop_negative_rows(partition):
return partition[(partition['extra'] >= 0) &
(partition['passenger_count'] >= 0) &
(partition['trip_distance'] >= 0) &
(partition['tip_amount'] >= 0) &
(partition['tolls_amount'] >= 0) & (partition['total_amount'] >= 0) &
(partition['congestion_surcharge'] >= 0)]
filtered_df = filtered_df.map_partitions(drop_negative_rows)
with ProgressBar():
filtered_df = filtered_df.repartition(partition_size="64MB")
filtered_df.to_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 71.11 s
[########################################] | 100% Completed | 167.56 s
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
with ProgressBar():
stats = filtered_df.describe().compute()
stats
[########################################] | 100% Completed | 153.57 s
| | VendorID | passenger_count | trip_distance | RatecodeID | PULocationID | DOLocationID | payment_type | fare_amount | extra | tip_amount | tolls_amount | total_amount | congestion_surcharge |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.742050e+08 | 1.742050e+08 | 1.742050e+08 | 1.742050e+08 | 1.742050e+08 | 1.742050e+08 | 1.742050e+08 | 1.742050e+08 | 1.742050e+08 | 1.742050e+08 | 1.742050e+08 | 1.742050e+08 | 1.742050e+08 |
| mean | 1.669321e+00 | 1.488854e+00 | 3.109136e+00 | 1.151356e+00 | 1.641665e+02 | 1.620703e+02 | 1.259357e+00 | 1.354792e+01 | 1.097812e+00 | 2.363918e+00 | 4.014292e-01 | 1.976463e+01 | 2.258285e+00 |
| std | 4.704577e-01 | 1.113451e+00 | 3.643040e+01 | 3.020670e+00 | 6.568577e+01 | 7.027044e+01 | 4.608893e-01 | 1.730048e+02 | 3.790361e+01 | 1.111647e+01 | 1.803824e+00 | 1.933940e+02 | 7.401891e-01 |
| min | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | -6.575000e+02 | -7.500000e+00 | -1.100000e+02 | -4.950000e+01 | -6.625000e+02 | -2.500000e+00 |
| 25% | 2.000000e+00 | 1.000000e+00 | 1.210000e+00 | 1.000000e+00 | 1.370000e+02 | 1.320000e+02 | 1.000000e+00 | 8.500000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 1.280000e+01 | 2.500000e+00 |
| 50% | 2.000000e+00 | 1.000000e+00 | 2.090000e+00 | 1.000000e+00 | 1.630000e+02 | 1.630000e+02 | 1.000000e+00 | 1.200000e+01 | 1.000000e+00 | 2.360000e+00 | 0.000000e+00 | 1.716000e+01 | 2.500000e+00 |
| 75% | 2.000000e+00 | 2.000000e+00 | 4.200000e+00 | 1.000000e+00 | 2.360000e+02 | 2.360000e+02 | 2.000000e+00 | 2.400000e+01 | 2.500000e+00 | 4.550000e+00 | 0.000000e+00 | 3.324000e+01 | 2.500000e+00 |
| max | 2.000000e+00 | 1.120000e+02 | 1.843408e+05 | 9.900000e+01 | 2.650000e+02 | 2.650000e+02 | 5.000000e+00 | 9.983100e+05 | 5.000008e+05 | 1.414920e+05 | 9.565500e+02 | 1.084772e+06 | 4.500000e+00 |
# Define the function for outlier detection
def detect_outliers(partition, column_name, z_score_threshold):
mean = stats.loc['mean', column_name]
std = stats.loc['std', column_name]
partition['z_score'] = (partition[column_name] - mean) / std
outliers = partition[abs(partition['z_score']) > z_score_threshold]
return outliers
# Define the function for outlier removal (keeps only rows within the threshold)
def delete_outliers(partition, column_name, z_score_threshold):
mean = stats.loc['mean', column_name]
std = stats.loc['std', column_name]
z = (partition[column_name] - mean) / std
outliers = partition[abs(z) <= z_score_threshold]
return outliers
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='trip_distance', z_score_threshold=2)
with ProgressBar():
filtered_df2 = filtered_df2.repartition(partition_size="64MB")
[########################################] | 100% Completed | 77.52 s
with ProgressBar():
z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
[########################################] | 100% Completed | 74.74 s
z_score.reset_index()
filtered_df = filtered_df.map_partitions(delete_outliers, column_name='trip_distance', z_score_threshold=6)
with ProgressBar():
filtered_df = filtered_df.repartition(partition_size="64MB")
[########################################] | 100% Completed | 81.24 s
with ProgressBar():
filtered_df.to_parquet(output_dir, engine='pyarrow')
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 219.96 s
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='passenger_count', z_score_threshold=1)
with ProgressBar():
filtered_df2 = filtered_df2.repartition(partition_size="64MB")
with ProgressBar():
z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
z_score.reset_index()
filtered_df = filtered_df.map_partitions(delete_outliers, column_name='passenger_count', z_score_threshold=10)
with ProgressBar():
filtered_df = filtered_df.repartition(partition_size="64MB")
[########################################] | 100% Completed | 77.49 s
with ProgressBar():
filtered_df.to_parquet(output_dir, engine='pyarrow')
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 189.43 s
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='fare_amount', z_score_threshold=1)
with ProgressBar():
filtered_df2 = filtered_df2.repartition(partition_size="64MB")
with ProgressBar():
z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
z_score.reset_index()
filtered_df = filtered_df.map_partitions(delete_outliers, column_name='fare_amount', z_score_threshold=100)
with ProgressBar():
filtered_df = filtered_df.repartition(partition_size="64MB")
[########################################] | 100% Completed | 88.65 s
with ProgressBar():
filtered_df.to_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 200.32 s
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
filtered_df2 = filtered_df2.map_partitions(detect_outliers, column_name='extra', z_score_threshold=1)
with ProgressBar():
filtered_df2 = filtered_df2.repartition(partition_size="64MB")
with ProgressBar():
z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
z_score.reset_index()
filtered_df = filtered_df.map_partitions(delete_outliers, column_name='extra', z_score_threshold=100)
with ProgressBar():
filtered_df = filtered_df.repartition(partition_size="64MB")
filtered_df.to_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 104.70 s
[########################################] | 100% Completed | 214.02 s
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='tip_amount', z_score_threshold=1)
with ProgressBar():
filtered_df2 = filtered_df2.repartition(partition_size="64MB")
with ProgressBar():
z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
z_score.reset_index()
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='tolls_amount', z_score_threshold=1)
with ProgressBar():
filtered_df2 = filtered_df2.repartition(partition_size="64MB")
with ProgressBar():
z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
z_score.reset_index()
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='total_amount', z_score_threshold=1)
with ProgressBar():
filtered_df2 = filtered_df2.repartition(partition_size="64MB")
with ProgressBar():
z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
z_score.reset_index()
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='congestion_surcharge', z_score_threshold=1)
with ProgressBar():
filtered_df2 = filtered_df2.repartition(partition_size="64MB")
with ProgressBar():
z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
z_score.reset_index()
with ProgressBar():
filtered_df.to_parquet(output_dir, engine='pyarrow')
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
Outliers were detected using the z-score, which measures how far a value deviates from the mean in units of the standard deviation. We found deviations in some values of the following columns:
Values with a deviation greater than 10 were dropped because they are rare cases whose values could distort the analysis.
Two values with a deviation greater than 6 were dropped, their deviations being 84 and 98 (a huge deviation for the passenger count of a taxi 🙃).
Values with a deviation greater than 100 were dropped, since most values deviate by comparable amounts below that level, apart from a few anomalies.
Values with deviations greater than 100 were dropped (we considered 100 a reasonable cutoff compared with the rest of the values, and because the quantity is monetary); only a single value met this condition (13191.35).
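Since the same detect-then-delete pattern repeats for each column, the blocks above could be collapsed into a single loop. A sketch, reusing the delete_outliers function and the thresholds chosen above (note that stats is the one-off describe() snapshot, so it goes stale as rows are removed):
z_thresholds = {'trip_distance': 6, 'passenger_count': 10, 'fare_amount': 100, 'extra': 100}
for column, threshold in z_thresholds.items():
    # drop rows whose |z-score| exceeds the per-column threshold
    filtered_df = filtered_df.map_partitions(delete_outliers, column_name=column, z_score_threshold=threshold)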
with ProgressBar():
rate_code_counts = filtered_df['RatecodeID'].value_counts(dropna=False).compute()
store_fwd_flag_counts = filtered_df['store_and_fwd_flag'].value_counts(dropna=False).compute()
payment_type_counts = filtered_df['payment_type'].value_counts(dropna=False).compute()
[########################################] | 100% Completed | 15.39 s
[########################################] | 100% Completed | 30.57 s
[########################################] | 100% Completed | 16.67 s
expected_rate_codes = [1, 2, 3, 4, 5, 6]
unexpected_rate_codes = rate_code_counts[~rate_code_counts.index.isin(expected_rate_codes)]
unexpected_rate_codes
99.0    163346
Name: RatecodeID, dtype: int64
expected_store_fwd_flags = ['Y', 'N']
unexpected_store_fwd_flags = store_fwd_flag_counts[~store_fwd_flag_counts.index.isin(expected_store_fwd_flags)]
expected_payment_types = [1, 2, 3, 4, 5, 6]
unexpected_payment_types = payment_type_counts[~payment_type_counts.index.isin(expected_payment_types)]
filtered_df = filtered_df.map_partitions(
lambda partition: partition[
~partition['RatecodeID'].isin(unexpected_rate_codes.index) &
~partition['store_and_fwd_flag'].isin(unexpected_store_fwd_flags.index) &
~partition['payment_type'].isin(unexpected_payment_types.index)
]
)
filtered_df = filtered_df.map_partitions(lambda partition: partition.drop_duplicates())  # note: deduplicates within each partition only, not across partitions
with ProgressBar():
filtered_df = filtered_df.repartition(partition_size="64MB")
filtered_df.to_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 100.92 s
[########################################] | 100% Completed | 187.63 s
Dropped rows whose RatecodeID falls outside the defined categories ([1, 2, 3, 4, 5, 6]).
Dropped rows whose store_and_fwd_flag is not one of the two flags (['Y', 'N']).
Dropped rows whose payment_type falls outside the defined categories ([1, 2, 3, 4, 5, 6]).
cleaned_data = dd.read_parquet(output_dir, engine='pyarrow')
with ProgressBar():
shape = cleaned_data.shape[0].compute()
shape
[########################################] | 100% Completed | 61.72 s
174041065
cleaned_data.columns
with ProgressBar():
cleaned_data['pickup_hour'] = cleaned_data['tpep_pickup_datetime'].dt.hour
cleaned_data['pickup_day'] = cleaned_data['tpep_pickup_datetime'].dt.day
cleaned_data['pickup_month'] = cleaned_data['tpep_pickup_datetime'].dt.month
cleaned_data['pickup_year'] = cleaned_data['tpep_pickup_datetime'].dt.year
result_df = cleaned_data.groupby(['pickup_year', 'pickup_month', 'pickup_day', 'pickup_hour', 'VendorID']).agg({'total_amount': 'sum', 'VendorID': 'count'}).compute()
[########################################] | 100% Completed | 93.75 s
result_df = result_df.rename(columns={'VendorID': 'trip_count', 'total_amount': 'total_revenue'}).reset_index()
result_df.pickup_year.unique()
array([2019, 2020, 2021, 2022])
result_df['datetime'] = pd.to_datetime(result_df[['pickup_year', 'pickup_month', 'pickup_day', 'pickup_hour']].astype(str).agg('-'.join, axis=1) + ':00:00')
result_df.set_index('datetime', inplace=True)
df_resampled = result_df.groupby('VendorID').resample('H').agg(
{'total_revenue': 'sum', 'trip_count': 'sum'}).reset_index()
df_resampled.set_index('datetime', inplace=True)
df_resampled.sort_index(inplace=True)
df_resampled.index
DatetimeIndex(['2019-01-01 07:00:00', '2019-01-01 08:00:00',
'2019-01-01 09:00:00', '2019-01-01 10:00:00',
'2019-01-01 11:00:00', '2019-01-01 12:00:00',
'2019-01-01 13:00:00', '2019-01-01 14:00:00',
'2019-01-01 15:00:00', '2019-01-01 16:00:00',
...
'2022-12-31 19:00:00', '2022-12-31 19:00:00',
'2022-12-31 20:00:00', '2022-12-31 20:00:00',
'2022-12-31 21:00:00', '2022-12-31 21:00:00',
'2022-12-31 22:00:00', '2022-12-31 22:00:00',
'2022-12-31 23:00:00', '2022-12-31 23:00:00'],
dtype='datetime64[ns]', name='datetime', length=69666, freq=None)
df_resampled.to_csv('/content/drive/MyDrive/Colab Notebooks/time_series.csv', index=True)
Tuning the time machine
Check if there are any missing values in the resampled data
missing_values = df_resampled.isnull().sum()
print("missing values:", missing_values)
missing values: VendorID         0
total_revenue    0
trip_count       0
dtype: int64
Verify that the time intervals between consecutive data points are regular
time_diff = df_resampled.index[1] - df_resampled.index[0]
freq = str(time_diff)
print("Inferred Frequency:", freq)
Inferred Frequency: 0 days 01:00:00
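Comparing only the first two timestamps can miss gaps later in the series. A slightly stronger sketch checks every consecutive gap, per vendor (the index holds one row per vendor per hour):
for vendor_id, group in df_resampled.groupby('VendorID'):
    gaps = group.index.to_series().diff().dropna()
    irregular = gaps[gaps != pd.Timedelta(hours=1)]
    print(f"Vendor {vendor_id}: {len(irregular)} irregular gaps")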
time_series = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/time_series.csv')
df_long = pd.melt(time_series, id_vars=['datetime'], value_vars=['VendorID', 'total_revenue', 'trip_count'])
# Save the long format DataFrame to a CSV file
df_long.to_csv('/content/drive/MyDrive/Colab Notebooks/long_format_time_series.csv', index=False)
zone_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/taxi_zone_lookup.csv')
zone_data.columns
Index(['LocationID', 'Borough', 'Zone', 'service_zone'], dtype='object')
def map_zones(partition, column, new_column1, new_column2):
partition = partition.merge(zone_data[['LocationID', 'Zone', 'Borough']], left_on=column, right_on='LocationID', how='inner')
partition = partition.rename(columns={'Zone': new_column1, 'Borough': new_column2})
return partition
cleaned_data_with_features = cleaned_data.map_partitions(map_zones, column = 'PULocationID',
new_column1='source_zone',new_column2='source_borough')
cleaned_data_with_features = cleaned_data_with_features.map_partitions(map_zones, column = 'DOLocationID',
new_column1='destination_zone',new_column2='destination_borough')
cleaned_data_with_features.columns
Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
'tip_amount', 'tolls_amount', 'total_amount', 'congestion_surcharge',
'pickup_before_dropoff', 'pickup_hour', 'pickup_day', 'pickup_month',
'pickup_year', 'LocationID_x', 'source_zone', 'source_borough',
'LocationID_y', 'destination_zone', 'destination_borough'],
dtype='object')
columns_to_drop = ['LocationID_y', 'LocationID_x']
def drop_columns(partition):
return partition.drop(columns_to_drop, axis=1)
cleaned_data_with_features = cleaned_data_with_features.map_partitions(drop_columns)
def create_location_pair(partition):
partition['location_pair'] = partition[['source_zone', 'destination_zone']].apply(
lambda x: ','.join(sorted(x)), axis=1
)
return partition
cleaned_data_with_features = cleaned_data_with_features.map_partitions(create_location_pair)
payment_type_mapping = {
1: 'Credit card',
2: 'Cash',
3: 'No charge',
4: 'Dispute',
5: 'Unknown',
6: 'Voided trip'
}
def map_payment_type_name(partition):
partition['payment_type_name'] = partition['payment_type'].map(payment_type_mapping)
return partition
cleaned_data_with_features = cleaned_data_with_features.map_partitions(map_payment_type_name)
vendor_mapping = {
1: 'Creative Mobile Technologies, LLC',
2: 'VeriFone Inc.'
}
def map_vendor(partition):
partition['vendor'] = partition['VendorID'].map(vendor_mapping)
return partition
cleaned_data_with_features = cleaned_data_with_features.map_partitions(map_vendor)
rate_code_mapping = {
1: 'Standard rate',
2: 'JFK',
3: 'Newark',
4: 'Nassau or Westchester',
5: 'Negotiated fare',
6: 'Group ride'
}
def map_rate_code(partition):
partition['rate_code'] = partition['RatecodeID'].map(rate_code_mapping)
return partition
cleaned_data_with_features = cleaned_data_with_features.map_partitions(map_rate_code)
output_cleaned_data_with_features= "/content/drive/MyDrive/Colab Notebooks/cleaned_data_with_features.parquet"
with ProgressBar():
cleaned_data_with_features.to_parquet(output_cleaned_data_with_features, engine='pyarrow')
cleaned_data_with_features = dd.read_parquet(output_cleaned_data_with_features, engine='pyarrow')
[########################################] | 100% Completed | 20m 41s
with ProgressBar():
location_pair_counts = cleaned_data_with_features['location_pair'].value_counts(dropna=False).compute()
[########################################] | 100% Completed | 33.33 s
location_pair_counts_df = location_pair_counts.reset_index().rename(columns={"index":'location_pair', "location_pair": 'count'})
max_count = location_pair_counts_df['count'].max()
mean_count = location_pair_counts_df['count'][1:].mean()
min_count = location_pair_counts_df['count'].min()
quartiles = np.round(np.percentile([min_count, mean_count, max_count/2], [10, 53, 60, 100]))
thresholds = [
min_count,
int(quartiles[0]), #rare
int(quartiles[1]), #less-common
int(quartiles[2]), #common
max_count #more-common
]
print(thresholds)
[1, 1172, 71088, 223296, 2186115]
(location_pair_counts_df[((location_pair_counts_df['count'] > thresholds[0]) & (location_pair_counts_df['count'] < thresholds[1]))]['count'].sum() ,
location_pair_counts_df[((location_pair_counts_df['count'] > thresholds[1]) & (location_pair_counts_df['count'] < thresholds[2]))]['count'].sum() ,
location_pair_counts_df[((location_pair_counts_df['count'] > thresholds[2]) & (location_pair_counts_df['count'] < thresholds[3]))]['count'].sum(),
location_pair_counts_df[((location_pair_counts_df['count'] > thresholds[3]) & (location_pair_counts_df['count'] < thresholds[4]))]['count'].sum()
)
(2985659, 41283605, 58510933, 68556780)
def assign_trip_class(partition):
partition['trip_class'] = pd.cut(partition['location_pair'].map(location_pair_counts), bins=thresholds, labels=['rare', 'less-common', 'common', 'more-common'])
return partition
cleaned_data_with_features = cleaned_data_with_features.map_partitions(assign_trip_class)
cleaned_data_with_features['trip_duration'] = (cleaned_data_with_features['tpep_dropoff_datetime'] - cleaned_data_with_features['tpep_pickup_datetime']) / timedelta(minutes=1)
cleaned_data_with_features['trip_distance_km'] = cleaned_data_with_features['trip_distance'] * 1.609344
cleaned_data_with_features.columns
with ProgressBar():
cleaned_data_with_features.to_parquet(output_cleaned_data_with_features, engine='pyarrow')
[########################################] | 100% Completed | 427.90 s
featured_data = dd.read_parquet('nytchallenge/featured_data.parquet', engine='pyarrow')
with ProgressBar():
vendor_counts = featured_data['vendor'].value_counts().compute()
[########################################] | 100% Completed | 74.71 s
fig = px.pie(vendor_counts, values=vendor_counts.values, names=vendor_counts.index, title='Vendor Share')
fig.show()
After doing some research, we found that VeriFone Inc. is the older player in the market: it was founded in 1992 under a different name, originally by a taxi driver, and today operates as Curb. It moved into ride-hailing app technology to compete with Uber and Lyft, and in March 2022 it finally partnered with Uber, which may explain its dominance of the market compared with Creative Mobile Technologies, LLC, founded in 2005, which offers many technology solutions such as mobile apps, websites, and more.
pio.write_image(fig, 'Vendor Share.png', format='png')
View the share of each borough (source_borough, destination_borough)
with ProgressBar():
source_borough_counts = featured_data['source_borough'].value_counts().compute()
destination_borough_counts = featured_data['destination_borough'].value_counts().compute()
[########################################] | 100% Completed | 75.75 s
[########################################] | 100% Completed | 74.41 s
borough_counts = pd.concat([source_borough_counts, destination_borough_counts])
fig = px.pie(borough_counts, values=borough_counts.values, names=borough_counts.index, title='Borough Share')
fig.show()
Manhattan is the best-known, busiest, and most-visited of all the boroughs, which is the likely reason it dominates the market.
pio.write_image(fig, 'Borough Share.png', format='png')
with ProgressBar():
payment_type_counts = featured_data['payment_type_name'].value_counts().compute()
[########################################] | 100% Completed | 73.68 s
fig = px.bar(payment_type_counts, x=payment_type_counts.index, y=payment_type_counts.values, title='Payment Methods')
fig.update_layout(
yaxis=dict(
title='Count'
)
)
fig.show()
This is because both vendors are modern technology companies; moreover, after a little research it turned out that most taxi drivers in New York City rely on credit cards as the payment method.
pio.write_image(fig, 'Payment Methods.png', format='png')
with ProgressBar():
payment_type_trip_class_counts = featured_data.groupby(['payment_type_name', 'trip_class']).size().reset_index().rename(columns={0: 'count'}).compute()
[########################################] | 100% Completed | 383.37 s
fig = px.sunburst(payment_type_trip_class_counts, path=['trip_class', 'payment_type_name'], values='count')
fig.show()
pio.write_image(fig, 'payment type of trip class.png', format='png')
with ProgressBar():
correlation_matrix = featured_data[['total_amount', 'tip_amount', 'fare_amount', 'passenger_count', 'trip_duration', 'trip_distance']].corr().compute()
correlation_matrix
[########################################] | 100% Completed | 161.47 s
| | total_amount | tip_amount | fare_amount | passenger_count | trip_duration | trip_distance |
|---|---|---|---|---|---|---|
| total_amount | 1.000000 | 0.666808 | 0.976456 | 0.014598 | 0.148588 | 0.845064 |
| tip_amount | 0.666808 | 1.000000 | 0.521965 | 0.001175 | 0.080030 | 0.528000 |
| fare_amount | 0.976456 | 0.521965 | 1.000000 | 0.015325 | 0.150626 | 0.827287 |
| passenger_count | 0.014598 | 0.001175 | 0.015325 | 1.000000 | 0.015952 | 0.017038 |
| trip_duration | 0.148588 | 0.080030 | 0.150626 | 0.015952 | 1.000000 | 0.153610 |
| trip_distance | 0.845064 | 0.528000 | 0.827287 | 0.017038 | 0.153610 | 1.000000 |
heatmap_trace = go.Heatmap(
z=correlation_matrix.values,
x=correlation_matrix.columns,
y=correlation_matrix.columns,
colorscale='Viridis'
)
layout = go.Layout(
title='Correlation Matrix Heatmap',
)
fig = go.Figure(data=[heatmap_trace], layout=layout)
fig.show()
We notice that distance is strongly correlated with the total amount and the fare amount, and of course the fare amount is strongly correlated with the total amount. The correlation between passenger count and the other features is weak (it is not as if the fare were charged per passenger 😬). Distance and trip duration are not strongly correlated, because external factors can affect trip duration (speed, congestion; long distances are often intercity trips with high speed limits and little heavy traffic, ...).
pio.write_image(fig, 'Correlation.png', format='png')
subset_data = featured_data[['trip_class', 'total_amount', 'trip_duration', 'trip_distance']]
with ProgressBar():
stats_per_class = subset_data.groupby('trip_class').agg({
'total_amount': ['mean', 'median', 'std'],
'trip_duration': ['mean', 'median', 'std'],
'trip_distance': ['mean', 'median', 'std']
}, shuffle='tasks').compute()
stats_per_class = stats_per_class.stack().reset_index()
stats_per_class2 = stats_per_class.melt(
id_vars=['trip_class', 'level_1'],
value_vars=['total_amount', 'trip_duration', 'trip_distance'],
var_name='metric',
value_name='value'
)
title = 'Mean, Median, and Standard Deviation for Total Cost, Trip Duration, and Distance Traveled per Trip Class'
fig = px.bar(
stats_per_class2,
x='metric',
y='value',
color='value',
facet_row='trip_class',
facet_col='level_1',
title=title,
barmode='group',
)
fig.update_yaxes(matches='y')
fig.update_layout(height=800)
fig.show()
pio.write_image(fig, 'statistics DataFrame.png', format='png')
time_series_df = pd.read_csv('nytchallenge/time_series.csv', parse_dates=["datetime"], index_col="datetime")
time_series_df.columns
Index(['VendorID', 'total_revenue', 'trip_count'], dtype='object')
vendor1_data = time_series_df[time_series_df['VendorID'] == 1][['total_revenue','trip_count']]
vendor2_data = time_series_df[time_series_df['VendorID'] == 2][['total_revenue','trip_count']]
*Autocorrelation*
is simply the correlation of a series with itself: it measures a set of current values against a set of past values to see whether they correlate. It is heavily used in time series analysis and forecasting.
acf_values = acf(vendor1_data['total_revenue'], nlags=48)
lags = np.arange(len(acf_values))  # acf returns nlags + 1 values (lag 0 included)
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Scatter(x=lags, y=acf_values, mode='lines', name='ACF'), row=1, col=1)
fig.update_layout(height=600, width=800, title_text='Vendor 1 Autocorrelation Function - Total Revenue')
fig.update_xaxes(title_text='Lag')
fig.update_yaxes(title_text='Autocorrelation')
fig.show()
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
plot_acf(vendor1_data['total_revenue'], lags=48, ax=axes[0, 0], alpha=0.5)
axes[0, 0].set_title('Autocorrelation for Vendor 1 - Total Revenue')
plot_acf(vendor2_data['total_revenue'], lags=48, ax=axes[0, 1], alpha=0.5)
axes[0, 1].set_title('Autocorrelation for Vendor 2 - Total Revenue')
plot_pacf(vendor1_data['total_revenue'], lags=48, ax=axes[1, 0], alpha=0.5)
axes[1, 0].set_title('Partial Autocorrelation for Vendor 1 - Total Revenue')
plot_pacf(vendor2_data['total_revenue'], lags=48, ax=axes[1, 1], alpha=0.5)
axes[1, 1].set_title('Partial Autocorrelation for Vendor 2 - Total Revenue')
plt.tight_layout()
plt.show()
A window size equal to the seasonal period (e.g., 12 for a monthly series) effectively nullifies the seasonal effect.
vendor1_smooth = vendor1_data.rolling(window=24*30, center=True, min_periods=1).mean() # a rolling window of 24*30 hours (~1 month)
vendor2_smooth = vendor2_data.rolling(window=24*30, center=True, min_periods=1).mean() # same 24*30-hour (~1 month) window
vendor1_smooth_df = pd.DataFrame({'Date': vendor1_data.index, 'Smoothed Total Revenue': vendor1_smooth['total_revenue'], 'Smoothed Trip Count':vendor1_smooth['trip_count']})
vendor2_smooth_df = pd.DataFrame({'Date': vendor2_data.index, 'Smoothed Total Revenue': vendor2_smooth['total_revenue'], 'Smoothed Trip Count':vendor2_smooth['trip_count']})
fig = make_subplots(rows=2, cols=2, subplot_titles=[
'Smoothed Time Series for Vendor 1 - Total Revenue',
'Smoothed Time Series for Vendor 1 - Trip Count',
'Smoothed Time Series for Vendor 2 - Total Revenue',
'Smoothed Time Series for Vendor 2 - Trip Count'
])
fig.add_trace(go.Scatter(x=vendor1_smooth_df['Date'], y=vendor1_smooth_df['Smoothed Total Revenue']), row=1, col=1)
fig.add_trace(go.Scatter(x=vendor1_smooth_df['Date'], y=vendor1_smooth_df['Smoothed Trip Count']), row=1, col=2)
fig.add_trace(go.Scatter(x=vendor2_smooth_df['Date'], y=vendor2_smooth_df['Smoothed Total Revenue']), row=2, col=1)
fig.add_trace(go.Scatter(x=vendor2_smooth_df['Date'], y=vendor2_smooth_df['Smoothed Trip Count']), row=2, col=2)
fig.update_layout(height=600, width=1200, title_text="Smoothed Time Series Subplots")
fig.show()
fig = go.Figure()
fig.add_trace(go.Scatter(x=vendor1_data.index, y=vendor1_data["total_revenue"], mode='lines', name='Original', line=dict(color='#379BDB')))
fig.add_trace(go.Scatter(x=vendor1_data.index, y=vendor1_smooth_df["Smoothed Total Revenue"], mode='lines', name='Rolling Mean', line=dict(color='#D22A0D')))
# Update layout
fig.update_layout(
title="Rolling Mean & Original",
xaxis_title="Date",
yaxis_title="total revenue",
showlegend=True,
legend=dict(x=0.02, y=0.98, bgcolor='rgba(255, 255, 255, 0.5)'),
plot_bgcolor='rgba(0, 0, 0, 0)',
paper_bgcolor='rgba(0, 0, 0, 0)',
height=500,
width=900
)
xcoords = ['2019-01-01', '2019-02-01', '2019-03-01', '2019-04-01', '2019-05-01',
'2019-06-01', '2019-07-01', '2019-08-01', '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01',
'2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-05-01',
'2020-06-01', '2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01', '2020-11-01', '2020-12-01',
'2021-01-01', '2021-02-01', '2021-03-01', '2021-04-01', '2021-05-01',
           '2021-06-01', '2021-07-01', '2021-08-01', '2021-09-01', '2021-10-01', '2021-11-01', '2021-12-01',
           '2022-01-01', '2022-02-01', '2022-03-01', '2022-04-01', '2022-05-01',
'2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01', '2022-12-01'] # List of x-coordinates for vertical lines
for xc in xcoords:
fig.add_shape(type='line', x0=xc, y0=vendor1_smooth_df['Smoothed Total Revenue'].min(),
x1=xc, y1=vendor1_smooth_df['Smoothed Total Revenue'].max(),
line=dict(color='black', dash='dash'))
fig.show()
fig = make_subplots(rows=2, cols=1, subplot_titles=[
'Vendor 1 - Total Revenue and Trip Count',
'Vendor 2 - Total Revenue and Trip Count'
])
fig.add_trace(go.Scatter(x=vendor1_data.index, y=vendor1_data['total_revenue'], name='Vendor 1 - Total Revenue'), row=1, col=1)
fig.add_trace(go.Scatter(x=vendor1_data.index, y=vendor1_data['trip_count'], name='Vendor 1 - Trip Count'), row=1, col=1)
fig.add_trace(go.Scatter(x=vendor2_data.index, y=vendor2_data['total_revenue'], name='Vendor 2 - Total Revenue'), row=2, col=1)
fig.add_trace(go.Scatter(x=vendor2_data.index, y=vendor2_data['trip_count'], name='Vendor 2 - Trip Count'), row=2, col=1)
fig.update_layout(height=600, width=900, title_text="Vendor Data Subplots", showlegend=True)
fig.show()
Here we plotted the time series month by month and noticed a sharp drop in trip counts and revenue, followed by a modest recovery, for both vendors between April 2020 and October 2020. After some research, we found that the COVID-19 period had a severely negative impact on taxi agencies and drivers in general; in Ireland, drivers even staged protests. According to the statistics, more than half of taxi drivers never returned to work after the pandemic, which explains why, even after revenue and trip counts began to climb again, they never returned to their pre-pandemic levels.
pio.write_image(fig, 'Vendor Data Subplots.png', format='png')
!jupyter nbconvert --to HTML '/content/drive/MyDrive/Colab Notebooks/DmH2.ipynb'
[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/DmH2.ipynb to HTML
/usr/local/lib/python3.10/dist-packages/nbconvert/filters/widgetsdatatypefilter.py:71: UserWarning: Your element with mimetype(s) dict_keys(['application/vnd.plotly.v1+json']) is not able to be represented. warn(
[NbConvertApp] Writing 5005210 bytes to /content/drive/MyDrive/Colab Notebooks/DmH2.html
Stationarity is an important characteristic of time series. A time series is said to be stationary if its statistical properties do not change over time: it has constant mean and variance, and its covariance is independent of time. Ideally, we want a stationary time series for modelling; if a series is not stationary, we can apply transformations (such as differencing) to make it stationary.
adfuller
The augmented Dickey-Fuller test tests the null hypothesis that a unit root is present. If the p-value is above the significance level (e.g., 0.05), we fail to reject the null and treat the process as non-stationary; if it is below, we reject the null and consider the process stationary.
adf, pvalue, _, _, critical_values, _ = stattools.adfuller(vendor1_data['total_revenue'],
autolag='AIC')
print(pvalue)
print('Probably ' + ('Non-Stationary' if pvalue > 0.05 else 'Stationary'))
2.8418965620348085e-07
Probably Stationary
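Had the p-value come out above 0.05, the usual transformation is a first difference; a sketch re-testing the differenced series (the same differencing also appears further below):
revenue_diff = vendor1_data['total_revenue'].diff().dropna()  # first difference
_, pvalue_diff, *_ = stattools.adfuller(revenue_diff, autolag='AIC')
print('Differenced series p-value:', pvalue_diff)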
train_size = int(len(vendor1_data) * 0.8) # 80% for training, 20% for testing
train_data = vendor1_data.iloc[:train_size]
test_data = vendor1_data.iloc[train_size:]
# Model the trend, seasonality, and residuals using seasonal_decompose
decomposition = seasonal_decompose(train_data['total_revenue'], model='additive', period=24)
trend = decomposition.trend
seasonal = decomposition.seasonal
residuals = decomposition.resid
# Create subplots
fig = make_subplots(rows=3, cols=1, shared_xaxes=True, subplot_titles=['Trend', 'Seasonality', 'Residuals'])
# Add traces for trend, seasonality, and residuals
fig.add_trace(go.Scatter(x=train_data.index, y=trend, mode='lines', name='Trend'), row=1, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=seasonal, mode='lines', name='Seasonality'), row=2, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=residuals, mode='lines', name='Residuals'), row=3, col=1)
# Update layout
fig.update_layout(height=800, width=1200, title_text="Seasonal Decomposition of Vendor 1 - Total Revenue")
# Show the figure
fig.show()
from itertools import product
ps = range(3, 7, 1)
ds = range(1, 2, 1)
qs = range(0, 4, 1)
order_list = list(product(ps, ds, qs))  # all (p, d, q) combinations to try
from typing import Union
from tqdm.notebook import tqdm
def optimize_SARIMAX(endog: Union[pd.Series, list], order_list: list, test_data: Union[pd.Series, list]):
    """Fit one SARIMAX model per (p, d, q) order, recording AIC and test MSE,
    and keep the predictions of the lowest-MSE model."""
    results = []
    lowest_mse = float('inf')
    best_predictions = None
    for order in tqdm(order_list):
        try:
            sarimax_model = SARIMAX(endog, order=order, simple_differencing=False).fit(disp=False)
            test_predictions = sarimax_model.predict(start=test_data.index[0], end=test_data.index[-1])
        except Exception:
            continue  # skip orders that fail to fit or predict
        aic = sarimax_model.aic
        mse = mean_squared_error(test_data['total_revenue'], test_predictions)
        if mse < lowest_mse:
            lowest_mse = mse
            best_predictions = test_predictions
        results.append([order, aic, mse])
    result_df = pd.DataFrame(results)
    result_df.columns = ['(p, d, q)', 'AIC', 'mse']
    # Sorting by AIC (ascending; lower is better) is an alternative selection criterion:
    # result_df = result_df.sort_values(by='AIC', ascending=True).reset_index(drop=True)
    return result_df, best_predictions
result_df3, best_predictions3 = optimize_SARIMAX(train_data['total_revenue'], order_list, test_data)
result_df3
[statsmodels emitted repeated warnings for each fit: ValueWarning (no frequency information provided, inferred frequency H used), UserWarning (non-stationary starting autoregressive parameters / non-invertible starting MA parameters, zeros used as starting parameters), and ConvergenceWarning (maximum likelihood optimization failed to converge; check mle_retvals).]
| | (p, d, q) | AIC | mse |
|---|---|---|---|
| 0 | (3, 1, 0) | 563122.708034 | 3.056047e+08 |
| 1 | (3, 1, 1) | 563121.817440 | 3.059815e+08 |
| 2 | (3, 1, 3) | 556066.491065 | 2.389156e+08 |
| 3 | (4, 1, 0) | 563089.315956 | 3.099661e+08 |
| 4 | (4, 1, 2) | 561383.434699 | 3.265998e+08 |
| 5 | (5, 1, 1) | 557991.144863 | 2.404729e+08 |
| 6 | (5, 1, 2) | 557957.945055 | 2.404858e+08 |
| 7 | (6, 1, 0) | 561817.851815 | 2.955761e+08 |
min_row_column2 = result_df3.loc[result_df3['mse'].idxmin()]
min_row_column2
(p, d, q)             (6, 1, 3)
AIC               556007.501217
mse            238151857.578634
Name: 21, dtype: object
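To read the MSE on the scale of hourly revenue, its square root is more interpretable; a quick sketch:
best_rmse = np.sqrt(min_row_column2['mse'])  # RMSE in revenue units
print(f'Best RMSE: {best_rmse:,.0f}')  # ≈ 15,432 given the MSE above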
sarimax_model = SARIMAX(train_data['total_revenue'], order=(6, 1, 3), simple_differencing=False).fit(disp=False)
best_predictions3 = sarimax_model.predict(start=test_data.index[0], end=test_data.index[-1])
[statsmodels warnings as above: inferred frequency H; non-stationary starting AR / non-invertible starting MA parameters; ConvergenceWarning.]
best_predictions_df3 = best_predictions3.reset_index()
best_predictions_df3.set_index('index', inplace= True)
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_data.index, y=train_data['total_revenue'], name='Total Revenue'))
fig.add_trace(go.Scatter(x=test_data.index, y=test_data['total_revenue'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=test_data.index, y=best_predictions_df3['predicted_mean'], mode='lines', name='ARIMA', line=dict(dash='dash')))
fig.update_layout(
xaxis=dict(title='Time'),
yaxis=dict(title='Total Revenue'),
legend=dict(x=0, y=1),
)
fig.update_xaxes(tickformat="%d")
fig.show()
# Take the first difference to make the process stationary
# (computed here for inspection; the d=1 term in the SARIMAX order performs the differencing internally)
vendor1_data_diff = vendor1_data - vendor1_data.shift(1)
vendor1_data_diff = vendor1_data_diff.replace([np.inf, -np.inf], np.nan).dropna()
train_size1 = int(len(vendor1_data) * 0.9) # 90% for training, 10% for testing
train_data1 = vendor1_data.iloc[:train_size1]
test_data1 = vendor1_data.iloc[train_size1:]
# result_df1, best_predictions1 = optimize_SARIMAX(train_data['total_revenue'], [(6, 1, 3)], test_data)
sarimax_model1 = SARIMAX(train_data1['total_revenue'], order=(6, 1, 3), simple_differencing=False).fit(disp=False)
best_predictions1 = sarimax_model1.predict(start=test_data1.index[0], end=test_data1.index[-1])
[statsmodels warnings as above: inferred frequency H; non-stationary starting AR / non-invertible starting MA parameters; ConvergenceWarning.]
best_predictions_df1 = best_predictions1.reset_index()
# import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_data1.index, y=train_data1['total_revenue'], name='Total Revenue'))
fig.add_trace(go.Scatter(x=test_data1.index, y=test_data1['total_revenue'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=test_data1.index, y=best_predictions_df1['predicted_mean'], mode='lines', name='SARIMAX', line=dict(dash='dash')))
fig.update_layout(
xaxis=dict(title='Time'),
yaxis=dict(title='Total Revenue'),
legend=dict(x=0, y=1),
)
fig.update_xaxes(tickformat="%m")
fig.show()
arima_model = SARIMAX(train_data['total_revenue'], order=(6, 1, 3), simple_differencing=False).fit(disp=False)
[statsmodels warnings as above: inferred frequency H; non-stationary starting AR / non-invertible starting MA parameters; ConvergenceWarning.]
decomposition = seasonal_decompose(arima_model.fittedvalues, model='additive', period=24)
fig = go.Figure()
fig = make_subplots(rows=4, cols=1, shared_xaxes=False, subplot_titles=['Fitted', 'Trend', 'Seasonal', 'Residual'])
fig.add_trace(go.Scatter(x=train_data.index, y=arima_model.fittedvalues, name='Fitted'), row=1, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=decomposition.trend, name='Trend'),row=2, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=decomposition.seasonal, name='Seasonal'),row=3, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=decomposition.resid, name='Residual'), row=4, col=1)
fig.update_layout(
height=800, width=1200,
title='Components of SARIMAX',
xaxis_title='Time',
yaxis_title='Value'
)
fig.show()
# Prophet model
prophet_model = Prophet()
prophet_data = train_data.reset_index().rename(columns={'datetime': 'ds', 'total_revenue': 'y'})
prophet_model.fit(prophet_data)
23:50:43 - cmdstanpy - INFO - Chain [1] start processing
23:51:25 - cmdstanpy - INFO - Chain [1] done processing
<prophet.forecaster.Prophet at 0x21d4b4bf4f0>
prophet_predictions = prophet_model.predict(test_data.reset_index().rename(columns={'datetime': 'ds', 'total_revenue': 'y'}))
prophet_predictions = prophet_predictions['yhat'].values
prophet_mse = mean_squared_error(test_data['total_revenue'], prophet_predictions)
print("Prophet Model Performance:")
print(f"MSE: {prophet_mse}")
Prophet Model Performance:
MSE: 243614190.98605546
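mean_absolute_error was imported alongside MSE; a sketch reporting it as well, since MAE reads directly in revenue units:
prophet_mae = mean_absolute_error(test_data['total_revenue'], prophet_predictions)
print(f'MAE: {prophet_mae}')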
# Plot the Prophet forecast components (trend and seasonalities)
fig = prophet_model.plot_components(prophet_model.predict(test_data.reset_index().rename(columns={'datetime': 'ds', 'total_revenue': 'y'})))
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_data.index, y=train_data['total_revenue'], name='Total Revenue'))
fig.add_trace(go.Scatter(x=test_data.index, y=test_data['total_revenue'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=test_data.index, y=prophet_predictions, mode='lines', name='Prophet'))
fig.update_layout(
xaxis=dict(title='Time'),
yaxis=dict(title='Total Revenue'),
legend=dict(x=0, y=1),
)
fig.update_xaxes(tickformat="%d")
fig.show()
prophet_future = prophet_model.make_future_dataframe(periods=30)
prophet_forecast = prophet_model.predict(prophet_future)
fig = go.Figure()
fig.add_trace(go.Scatter(x=prophet_forecast.ds, y=prophet_forecast['yhat'].values, mode='lines', name='Prophet'))
fig.update_layout(
xaxis=dict(title='Time'),
yaxis=dict(title='Total Revenue'),
legend=dict(x=0, y=1),
)
fig.update_xaxes(tickformat="%y")
fig.show()
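Note that make_future_dataframe defaults to daily frequency; for this hourly series, passing freq='H' would keep the forecast on the same hourly grid, if hourly steps were intended. A sketch:
prophet_future_hourly = prophet_model.make_future_dataframe(periods=30, freq='H')  # 30 hours ahead
prophet_forecast_hourly = prophet_model.predict(prophet_future_hourly)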
sarimax_model3 = SARIMAX(train_data['trip_count'], order=(6, 1, 3), simple_differencing=False).fit(disp=False)
best_predictions = sarimax_model3.predict(start=test_data.index[0], end=test_data.index[-1])
[statsmodels warnings as above: inferred frequency H; non-stationary starting AR / non-invertible starting MA parameters; ConvergenceWarning.]
best_predictions_df = best_predictions.reset_index()
best_predictions_df.set_index('index', inplace= True)
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_data.index, y=train_data['trip_count'], name='Trip Count'))
fig.add_trace(go.Scatter(x=test_data.index, y=test_data['trip_count'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=test_data.index, y=best_predictions_df['predicted_mean'], mode='lines', name='ARIMA', line=dict(dash='dash')))
fig.update_layout(
xaxis=dict(title='Time'),
    yaxis=dict(title='Trip Count'),
legend=dict(x=0, y=1),
)
fig.update_xaxes(tickformat="%d")
fig.show()
arima_model = SARIMAX(train_data['trip_count'], order=(6, 1, 3), simple_differencing=False).fit(disp=False)
[statsmodels warnings as above: inferred frequency H; non-stationary starting AR / non-invertible starting MA parameters; ConvergenceWarning.]
decomposition = seasonal_decompose(arima_model.fittedvalues, model='additive', period=24)
fig = go.Figure()
fig = make_subplots(rows=4, cols=1, shared_xaxes=False, subplot_titles=['Fitted', 'Trend', 'Seasonal', 'Residual'])
fig.add_trace(go.Scatter(x=train_data.index, y=arima_model.fittedvalues, name='Fitted'), row=1, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=decomposition.trend, name='Trend'),row=2, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=decomposition.seasonal, name='Seasonal'),row=3, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=decomposition.resid, name='Residual'), row=4, col=1)
fig.update_layout(
height=800, width=1200,
title='Components of SARIMAX',
xaxis_title='Time',
yaxis_title='Value'
)
fig.show()
from statsmodels.tsa import stattools
adf, pvalue, _, _, critical_values, _ = stattools.adfuller(vendor2_data['total_revenue'],
autolag='AIC')
print('Probably ' + ('Non-Stationary' if pvalue > 0.05 else 'Stationary'))
print(pvalue)
Probably Stationary
5.6842433239422795e-06
train_size = int(len(vendor2_data) * 0.8) # 80% for training, 20% for testing
train_data = vendor2_data.iloc[:train_size]
test_data = vendor2_data.iloc[train_size:]
sarimax_model4 = SARIMAX(train_data['total_revenue'], order=(6, 1, 3), simple_differencing=False).fit(disp=False)
best_predictions = sarimax_model4.predict(start=test_data.index[0], end=test_data.index[-1])
[statsmodels warnings: inferred frequency H; ConvergenceWarning: maximum likelihood optimization failed to converge.]
best_predictions_df = best_predictions.reset_index()
best_predictions_df.set_index('index', inplace= True)
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_data.index, y=train_data['total_revenue'], name='Total Revenue'))
fig.add_trace(go.Scatter(x=test_data.index, y=test_data['total_revenue'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=test_data.index, y=best_predictions_df['predicted_mean'], mode='lines', name='ARIMA', line=dict(dash='dash')))
fig.update_layout(
xaxis=dict(title='Time'),
yaxis=dict(title='Total Revenue'),
legend=dict(x=0, y=1),
)
fig.update_xaxes(tickformat="%d")
fig.show()
sarimax_model4 = SARIMAX(train_data['trip_count'], order=(6, 1, 3), simple_differencing=False).fit(disp=False)
best_predictions = sarimax_model4.predict(start=test_data.index[0], end=test_data.index[-1])
[statsmodels warnings as above: inferred frequency H; non-stationary starting AR / non-invertible starting MA parameters; ConvergenceWarning.]
best_predictions_df = best_predictions.reset_index()
best_predictions_df.set_index('index', inplace= True)
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_data.index, y=train_data['trip_count'], name='Trip Count'))
fig.add_trace(go.Scatter(x=test_data.index, y=test_data['trip_count'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=test_data.index, y=best_predictions_df['predicted_mean'], mode='lines', name='ARIMA', line=dict(dash='dash')))
fig.update_layout(
xaxis=dict(title='Time'),
yaxis=dict(title='trip_count'),
legend=dict(x=0, y=1),
)
fig.update_xaxes(tickformat="%d")
fig.show()
# Prophet model
prophet_model = Prophet()
prophet_data = train_data.reset_index().rename(columns={'datetime': 'ds', 'trip_count': 'y'})
prophet_model.fit(prophet_data)
00:01:09 - cmdstanpy - INFO - Chain [1] start processing
00:01:40 - cmdstanpy - INFO - Chain [1] done processing
<prophet.forecaster.Prophet at 0x21d77072e90>
prophet_predictions = prophet_model.predict(test_data.reset_index().rename(columns={'datetime': 'ds', 'trip_count': 'y'}))
prophet_predictions2 = prophet_predictions['yhat'].values
prophet_mse = mean_squared_error(test_data['trip_count'], prophet_predictions2)
print("Prophet Model Performance:")
print(f"MSE: {prophet_mse}")
Prophet Model Performance:
MSE: 775593.7134831949
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_data.index, y=train_data['trip_count'], name='Trip Count'))
fig.add_trace(go.Scatter(x=test_data.index, y=test_data['trip_count'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=test_data.index, y=prophet_predictions2, mode='lines', name='Prophet'))
fig.update_layout(
xaxis=dict(title='Time'),
yaxis=dict(title='Trip Count'),
legend=dict(x=0, y=1),
)
fig.update_xaxes(tickformat="%m")
fig.show()
fig = prophet_model.plot_components(prophet_predictions)